knitr::opts_knit$set(root.dir = ".")
# load libraries
library(cowplot) # plot_grid()
library(dplyr) # left_join()
library(ggplot2) # ggplot()
library(gridExtra) # grid.arrange()
library(harmony) # RunHarmony()
library(parallel) # detectCores()
library(Seurat) # Read10X_h5()
library(stringr) # str_match()
# variables
sample_order <- c("E3.2M.M","E3.2M.F","E4.2M.M","E4.2M.F",
"E3.14M.M","E3.14M.F","E4.14M.M","E4.14M.F")
age_order <- c("2 months","14 months")
sex_order <- c("Male","Female")
isoform_order <- c("E3","E4")
sample_colors <- c("gray","red","orange","yellow","green","blue","purple","pink")
age_colors <- c("darkgray","chartreuse3")
sex_colors <- c("darkgray","purple")
isoform_colors <- c("darkgray","cornflowerblue")
# thresholds
nCount.min <- 400
nCount.max <- 25000
nFeature.min <- 250
complexity.cutoff <- 0.8
mt.cutoff <- 20
ribo.cutoff <- 20
hb.cutoff <- 3
# set seed
set.seed(8)
# work in parallel
options(mc.cores = detectCores() - 1)
These functions with help simultaneously save plots as a png and pdf.
saveToPDF <- function(...) {
d = dev.copy(pdf,...)
dev.off(d)
}
saveToPNG <- function(...) {
d = dev.copy(png,...)
dev.off(d)
}
prefix <- "../../cellbender/"
suffix <- "_fpr_0.05_filtered.h5"
if (file.exists("../../rObjects/mouse_merged_h5.rds")) {
mouse <- readRDS("../../rObjects/mouse_merged_h5.rds")
} else {
# individual sample objects
E3.2M.F <- CreateSeuratObject(Read10X_h5(paste0(prefix,"pass2_","E3_2M_F",suffix)))
E3.2M.M <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E3_2M_M",suffix)))
E3.14M.F <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E3_14M_F",suffix)))
E3.14M.M <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E3_14M_M",suffix)))
E4.2M.F <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E4_2M_F",suffix)))
E4.2M.M <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E4_2M_M",suffix)))
E4.14M.F <- CreateSeuratObject(Read10X_h5(paste0(prefix,"pass2_","E4_14M_F",suffix)))
E4.14M.M <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E4_14M_M",suffix)))
# merge objects
mouse <- merge(x = E3.2M.F,
y = c(E3.2M.M, E3.14M.F, E3.14M.M,
E4.2M.F, E4.2M.M, E4.14M.F, E4.14M.M),
add.cell.ids = c("E3.2M.F","E3.2M.M","E3.14M.F","E3.14M.M",
"E4.2M.F","E4.2M.M","E4.14M.F","E4.14M.M"),
project = "Mouse scRNAseq")
# cleanup and save
remove(E3.2M.F, E3.2M.M, E3.14M.F, E3.14M.M, E4.2M.F, E4.2M.M, E4.14M.F, E4.14M.M)
saveRDS(mouse, "../../rObjects/mouse_merged_h5.rds")
}
# preview
mouse
## An object of class Seurat
## 32285 features across 66679 samples within 1 assay
## Active assay: RNA (32285 features, 0 variable features)
# read in annotation file, GENCODE GRCm38 version M23 (Ensembl 98)
if (file.exists("../../rObjects/annotation.rds")) {
genes <- readRDS("../../rObjects/annotation.rds")
} else {
gtf.file <- "../../refs/mouse_genes.gtf"
genes <- rtracklayer::import(gtf.file)
genes <- as.data.frame(genes)
genes <- genes[genes$type == "gene",]
saveRDS(genes, "../../rObjects/annotation.rds")
}
nCount_RNA = total number of transcripts (UMIs) in a single cell nFeature_RNA = number of unique transcripts (features)
# create sample column
barcodes <- colnames(mouse)
pattern <- "(.+)_[ACGT]+-(\\d+)"
sample <- str_match(barcodes, pattern)[,2]
table(sample)
## sample
## E3.14M.F E3.14M.M E3.2M.F E3.2M.M E4.14M.F E4.14M.M E4.2M.F E4.2M.M
## 7221 8672 10862 7974 11021 7912 6805 6212
mouse$sample <- factor(sample, levels = sample_order)
table(mouse$sample) # check
##
## E3.2M.M E3.2M.F E4.2M.M E4.2M.F E3.14M.M E3.14M.F E4.14M.M E4.14M.F
## 7974 10862 6212 6805 8672 7221 7912 11021
Idents(mouse) <- mouse$sample
# age column
age <- str_match(mouse$sample, "E\\d.(\\d+)M.[FM]")[,2]
age <- gsub(2,"2 months",age)
age <- gsub(14,"14 months",age)
mouse$age <- factor(age, levels = age_order)
# sex column
sex <- str_match(mouse$sample, "E\\d.\\d+M.([FM])")[,2]
sex <- gsub("F","Female",sex)
sex <- gsub("M","Male",sex)
mouse$sex <- factor(sex, levels = sex_order)
# Apoe isoform column
isoform <- str_match(mouse$sample, "(E\\d).\\d+M.[FM]")[,2]
mouse$isoform <- factor(isoform, levels = isoform_order)
# cell.complexity
mouse$cell.complexity <- log10(mouse$nFeature_RNA) / log10(mouse$nCount_RNA)
# percent.mt
mt.genes <- genes[genes$seqnames == "chrM",13]
mouse$percent.mt <- PercentageFeatureSet(mouse, features = mt.genes)
mt.genes
## [1] "mt-Nd1" "mt-Nd2" "mt-Co1" "mt-Co2" "mt-Atp8" "mt-Atp6" "mt-Co3"
## [8] "mt-Nd3" "mt-Nd4l" "mt-Nd4" "mt-Nd5" "mt-Nd6" "mt-Cytb"
# percent.ribo
# ribosomal proteins begin with 'Rps' or 'Rpl' in this annotation file
# mitochondrial ribosomes start with 'Mrps' or 'Mrpl'
gene.names <- genes$gene_name
ribo <- gene.names[grep("^Rp[sl]", gene.names)]
mt.ribo <- gene.names[grep("^Mrp[sl]", gene.names)]
ribo.combined <- c(mt.ribo,ribo)
mouse$percent.ribo.protein <- PercentageFeatureSet(mouse, features = ribo.combined)
ribo.combined
## [1] "Mrpl15" "Mrpl30" "Mrps9" "Mrpl44" "Mrps14"
## [6] "Mrpl41" "Mrps2" "Mrps5" "Mrps26" "Mrps28"
## [11] "Mrpl47" "Mrpl24" "Mrpl9" "Mrps21" "Mrpl50"
## [16] "Mrpl37" "Mrps15" "Mrpl20" "Mrpl33" "Mrpl1"
## [21] "Mrps18c" "Mrps17" "Mrps33" "Mrpl35" "Mrpl19"
## [26] "Mrpl53" "Mrps25" "Mrpl51" "Mrps35" "Mrps12"
## [31] "Mrpl46" "Mrps11" "Mrpl48" "Mrpl17" "Mrpl23"
## [36] "Mrps31" "Mrpl34" "Mrpl4" "Mrps22" "Mrpl3"
## [41] "Mrpl54" "Mrpl42" "Mrps24" "Mrpl22" "Mrpl55"
## [46] "Mrps23" "Mrpl27" "Mrpl10" "Mrpl45" "Mrpl58"
## [51] "Mrps7" "Mrpl38" "Mrpl12" "Mrpl32" "Mrpl36"
## [56] "Mrps27" "Mrps36" "Mrps30" "Mrps16" "Mrpl52"
## [61] "Mrpl57" "Mrpl13" "Mrpl40" "Mrpl39" "Mrps6"
## [66] "Mrpl18" "Mrps34" "Mrpl28" "Mrps18b" "Mrpl14"
## [71] "Mrps18a" "Mrpl2" "Mrps10" "Mrpl21" "Mrpl11"
## [76] "Mrpl49" "Mrpl16" "Mrpl43" "Rpl7" "Rpl31"
## [81] "Rpl37a" "Rps6kc1" "Rpl7a" "Rpl12" "Rpl35"
## [86] "Rps21" "Rpl22l1" "Rps3a1" "Rps27" "Rpl34"
## [91] "Rps20" "Rps6" "Rps8" "Rps6ka1" "Rpl11"
## [96] "Rpl22" "Rpl9" "Rpl5" "Rplp0" "Rpl6"
## [101] "Rpl21" "Rpl32" "Rps9" "Rpl28" "Rps5"
## [106] "Rps19" "Rps16" "Rps11" "Rpl13a" "Rpl18"
## [111] "Rps17" "Rps3" "Rpl27a" "Rps13" "Rps15a"
## [116] "Rplp2" "Rpl18a" "Rpl13" "Rps25" "Rpl10-ps3"
## [121] "Rplp1" "Rpl4" "Rps27l" "Rpl29" "Rps27rt"
## [126] "Rpsa" "Rpl14" "Rps12" "Rps15" "Rpl41"
## [131] "Rps26" "Rps27a" "Rpl26" "Rpl23a" "Rpl9-ps1"
## [136] "Rps6kb1" "Rpl23" "Rpl19" "Rpl27" "Rpl38"
## [141] "Rps7" "Rpl10l" "Rps29" "Rpl36al" "Rps6kl1"
## [146] "Rps6ka5" "Rps23" "Rpl15" "Rps24" "Rpl36a-ps1"
## [151] "Rpl37" "Rpl30" "Rpl8" "Rpl3" "Rps19bp1"
## [156] "Rpl39l" "Rpl35a" "Rpl24" "Rps6ka2" "Rps2"
## [161] "Rpl3l" "Rps10" "Rpl10a" "Rps28" "Rps18"
## [166] "Rpl7l1" "Rpl36" "Rpl36-ps4" "Rps14" "Rpl17"
## [171] "Rps6kb2" "Rps6ka4" "Rpl9-ps6" "Rpl39" "Rpl10"
## [176] "Rps4x" "Rps6ka6" "Rpl36a" "Rps6ka3"
# percent.hb
# percent.hb - hemoglobin proteins begin with 'Hbb' or 'Hba' for mouse
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
mouse$percent.hb <- PercentageFeatureSet(mouse, features = hb.genes)
hb.genes
## [1] "Hbb-bt" "Hbb-bs" "Hbb-bh2" "Hbb-bh1" "Hbb-y" "Hba-x" "Hba-a1"
## [8] "Hba-a2"
# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse$sample))
colnames(data) <- c("sample","frequency")
ncells1 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
scale_fill_manual(values = sample_colors) +
scale_y_continuous(breaks = seq(0,12000, by = 3000), limits = c(0,12000)) +
ggtitle("Raw: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
ncells1
# Visualize nCount_RNA
den1 <- ggplot(mouse@meta.data,
aes(color = sample,
x = nCount_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("nCount_RNA") +
ylab("Density") +
theme(legend.position = "none") +
geom_vline(xintercept = nCount.min) +
geom_vline(xintercept = nCount.max) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize nFeature_RNA
den2 <- ggplot(mouse@meta.data,
aes(color = sample,
x = nFeature_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("nFeature_RNA") +
ylab("Density") +
geom_vline(xintercept = nFeature.min) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse@meta.data,
aes(color = sample,
x = cell.complexity,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("Cell Complexity (log10(nFeature/nCount))") +
ylab("Density") +
geom_vline(xintercept = complexity.cutoff) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.mt
den4 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.ribo.protein
den5 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.ribo.protein,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = ribo.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Ribosomal Protein Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.hb
den6 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.hb,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = hb.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Hemoglobin Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# nFeature, nCount, and cell.complexity violins
v1 <- VlnPlot(mouse,
features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v1
# percent violins
v2 <- VlnPlot(mouse,
features = c("percent.mt","percent.ribo.protein","percent.hb"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v2
s1 <- ggplot(
mouse@meta.data,
aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = nCount.min) +
geom_hline(yintercept = nFeature.min) +
facet_wrap(~sample) +
scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s1
## `geom_smooth()` using formula 'y ~ x'
s2 <- FeatureScatter(mouse,
feature1 = "nCount_RNA",
feature2 = "percent.mt",
group.by = 'sample',
cols = sample_colors,
shuffle = TRUE)
s2
# filter
mouse.filtered <- subset(mouse,
subset = (nCount_RNA > nCount.min) &
(nCount_RNA < nCount.max) &
(nFeature_RNA > nFeature.min) &
(cell.complexity > complexity.cutoff) &
(percent.mt < mt.cutoff) &
(percent.ribo.protein < ribo.cutoff) &
(percent.hb < hb.cutoff))
# print cells removed
print(paste0(dim(mouse)[2] - dim(mouse.filtered)[2]," cells removed"))
## [1] "31530 cells removed"
Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.
# filter genes
counts <- GetAssayData(object = mouse.filtered, slot = "counts")
nonzero <- counts > 0 # produces logical
keep <- Matrix::rowSums(nonzero) >= 10 # sum the true/false
counts.filtered <- counts[keep,] # keep certain genes
# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered,
meta.data = mouse.filtered@meta.data)
# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "11729 features removed"
Multiple passes were made to assess whether mitochondrial, ribosomal, and immunoglobulin genes should be removed. Ultimately, removal of these genes enhanced clustering.
# remove mt.genes
counts <- GetAssayData(object = mouse.filtered, slot = "counts")
keep <- !rownames(counts) %in% mt.genes # false when mt.gene
counts.filtered <- counts[keep,]
# remove ribo.genes
keep <- !rownames(counts.filtered) %in% ribo.combined
counts.filtered <- counts.filtered[keep,]
# remove Ig genes + Jchain but keep Igha + Ighd to enahnce clustering
gene.types <- c("IG_C_gene","IG_C_pseudogene","IG_D","IG_J_gene","IG_LV_gene",
"IG_V_gene","IG_V_pseudogene")
keep <- (genes$gene_type) %in% gene.types
ig.genes <- genes[keep,]
ig.genes <- c(ig.genes$gene_name, "Jchain")
ig.genes <- ig.genes[-c(185,192)] # keep Igha and Ighd
ig.genes
## [1] "Gm17472" "Gm20730" "Igkv2-137" "Igkv1-136" "Igkv1-135"
## [6] "Igkv14-134-1" "Igkv17-134" "Igkv1-133" "Gm43284" "Igkv1-132"
## [11] "Igkv1-131" "Igkv14-130" "Igkv9-129" "Igkv9-128" "Igkv17-127"
## [16] "Igkv14-126-1" "Igkv14-126" "Igkv11-125" "Igkv9-124" "Igkv9-123"
## [21] "Igkv1-122" "Igkv17-121" "Igkv9-120" "Igkv9-119" "Igkv14-118-2"
## [26] "Igkv14-118-1" "Gm43739" "Igkv11-118" "Igkv1-117" "Igkv2-116"
## [31] "Igkv1-115" "Igkv11-114" "Igkv2-113" "Igkv2-112" "Igkv14-111"
## [36] "Igkv1-110" "Igkv2-109" "Igkv1-108" "Igkv2-107" "Igkv11-106"
## [41] "Igkv2-105" "Igkv16-104" "Igkv15-103" "Igkv15-102" "Igkv20-101-2"
## [46] "Igkv15-101-1" "Igkv15-101" "Igkv14-100" "Igkv1-99" "Igkv12-98"
## [51] "Igkv15-97" "Igkv10-96" "Igkv2-95-2" "Igkv2-95-1" "Igkv10-95"
## [56] "Igkv10-94" "Igkv2-93-1" "Igkv19-93" "Igkv4-92" "Gm42925"
## [61] "Igkv4-91" "Igkv4-90" "Gm42924" "Igkv12-89" "Igkv1-88"
## [66] "Gm42539" "Gm42543" "Gm42542" "Igkv13-87" "Igkv4-86"
## [71] "Igkv13-85" "Igkv13-84" "Igkv4-83" "Igkv13-82" "Igkv4-81"
## [76] "Igkv13-80-1" "Igkv4-80" "Igkv4-79" "Igkv13-78-1" "Igkv4-78"
## [81] "Gm43583" "Igkv4-77" "Igkv13-76" "Igkv4-75" "Igkv13-74-1"
## [86] "Igkv4-74" "Igkv13-73-1" "Igkv4-73" "Igkv4-72" "Igkv13-71-1"
## [91] "Igkv4-71" "Igkv4-70" "Igkv4-69" "Igkv4-68" "Igkv12-67"
## [96] "Igkv12-66" "Igkv4-65" "Igkv13-64" "Igkv4-63" "Igkv13-62-1"
## [101] "Igkv4-62" "Igkv13-61-1" "Igkv4-61" "Igkv4-59" "Igkv4-60"
## [106] "Igkv4-58" "Igkv13-57-2" "Igkv4-57-1" "Igkv13-57-1" "Igkv4-57"
## [111] "Igkv13-56-1" "Igkv4-56" "Igkv13-55-1" "Igkv4-55" "Igkv13-54-1"
## [116] "Igkv4-54" "Igkv4-53" "Gm42606" "Igkv4-51" "Gm42958"
## [121] "Igkv4-50" "Igkv12-49" "Igkv5-48" "Igkv12-47" "Igkv12-46"
## [126] "Igkv5-45" "Igkv12-44" "Igkv5-43" "Igkv12-42" "Igkv12-41"
## [131] "Igkv5-40-1" "Igkv12-40" "Igkv5-39" "Gm43220" "Igkv12-38"
## [136] "Igkv5-37" "Igkv18-36" "Igkv1-35" "Gm43586" "Igkv8-34"
## [141] "Igkv7-33" "Igkv6-32" "Igkv8-31" "Igkv8-30" "Igkv6-29"
## [146] "Igkv8-28" "Igkv8-27" "Igkv8-26" "Igkv6-25" "Igkv8-24"
## [151] "Gm43218" "Igkv6-23" "Igkv8-22" "Igkv8-21" "Igkv6-20"
## [156] "Igkv8-19" "Igkv8-18" "Igkv6-17" "Igkv8-16" "Gm42667"
## [161] "Igkv6-15" "Gm10360" "Igkv6-14" "Igkv6-13" "Gm42720"
## [166] "Igkv3-12-1" "Igkv3-12" "Igkv3-11" "Igkv3-10" "Igkv3-9"
## [171] "Igkv3-8" "Igkv3-7" "Igkv3-6" "Igkv3-5" "Igkv3-4"
## [176] "Igkv3-3" "Igkv3-2" "Igkv3-1" "Igkj1" "Igkj2"
## [181] "Igkj3" "Igkj4" "Igkj5" "Igkc" "Ighe"
## [186] "Ighg2c" "Ighg2b" "Gm37944" "Ighg1" "Ighg3"
## [191] "Ighm" "Ighj4" "Ighj3" "Ighj2" "Ighj1"
## [196] "Ighv5-1" "Ighv2-1" "Ighv5-2" "Ighv2-2" "Ighv5-3"
## [201] "Ighv5-4" "Ighv6-1" "Ighv2-3" "Ighv5-5" "Ighv5-6"
## [206] "Ighv5-7" "Ighv2-4" "Ighv5-8" "Ighv5-8" "Ighv5-9"
## [211] "Ighv5-10" "Ighv2-5" "Ighv5-11" "Ighv5-12" "Ighv2-6"
## [216] "Gm37976" "Gm37418" "Ighv5-9-1" "Gm38203" "Ighv5-12-4"
## [221] "Gm38205" "Ighv2-9-1" "Gm37722" "Ighv5-13" "Ighv2-6-8"
## [226] "Gm7003" "Ighv2-7" "Ighv5-15" "Ighv5-16" "Ighv5-17"
## [231] "Ighv5-18" "Ighv2-8" "Ighv2-9" "Ighv5-19" "Ighv7-1"
## [236] "Ighv7-2" "Ighv14-1" "Ighv4-1" "Ighv3-1" "Gm37961"
## [241] "Ighv11-1" "Ighv14-2" "Ighv4-2" "Ighv3-2" "Ighv11-2"
## [246] "Ighv14-3" "Ighv16-1" "Ighv6-2" "Ighv9-1" "Ighv12-1"
## [251] "Ighv9-2" "Ighv12-2" "Ighv9-3" "Ighv7-3" "Ighv15-1"
## [256] "Ighv14-4" "Ighv3-3" "Ighv7-4" "Ighv3-4" "Ighv3-5"
## [261] "Ighv13-1" "Ighv3-6" "Ighv9-4" "Ighv3-7" "Ighv5-21"
## [266] "Ighv3-8" "Ighv8-1" "Ighv1-1" "Ighv13-2" "Ighv12-3"
## [271] "Ighv6-3" "Ighv6-4" "Ighv6-5" "Ighv6-6" "Ighv6-7"
## [276] "Ighv8-2" "Ighv1-2" "Ighv10-1" "Ighv1-3" "Ighv1-4"
## [281] "Ighv10-2" "Ighv1-5" "Ighv10-3" "Ighv1-6" "Ighv1-7"
## [286] "Ighv10-4" "Ighv1-8" "Ighv15-2" "Ighv1-9" "Ighv1-10"
## [291] "Ighv1-11" "Ighv1-12" "Ighv1-13" "Ighv1-13" "Ighv1-14"
## [296] "Ighv1-15" "Ighv1-16" "Ighv1-17-1" "Ighv1-17" "Ighv1-18"
## [301] "Ighv1-19-1" "Ighv1-19" "Ighv1-20" "Ighv1-21-1" "Ighv1-21"
## [306] "Ighv1-22" "Ighv1-23" "Ighv1-24" "Ighv1-25" "Ighv1-26"
## [311] "Ighv1-27" "Ighv1-28" "Ighv1-29" "Ighv1-30" "Ighv1-31"
## [316] "Ighv1-32" "Ighv1-33" "Ighv1-34" "Ighv1-35" "Ighv1-36"
## [321] "Ighv1-37" "Ighv1-38" "Ighv1-39" "Ighv1-40" "Ighv1-41"
## [326] "Ighv1-42" "Ighv1-43" "Ighv1-44" "Ighv1-45" "Ighv1-46"
## [331] "Ighv1-47" "Ighv8-3" "Ighv8-4" "Ighv1-48" "Ighv1-49"
## [336] "Ighv8-5" "Ighv1-50" "Ighv1-51" "Ighv1-52" "Ighv1-53"
## [341] "Ighv8-6" "Ighv1-54" "Ighv1-55" "Ighv1-56" "Ighv8-7"
## [346] "Ighv1-57" "Ighv8-8" "Ighv1-58" "Ighv1-59" "Ighv1-60"
## [351] "Ighv1-61" "Ighv1-62" "Ighv1-62-1" "Gm37511" "Gm38184"
## [356] "Ighv1-62-2" "Ighv1-62-3" "Ighv8-9" "Ighv1-63" "Ighv1-64"
## [361] "Ighv8-10" "Ighv1-65" "Ighv8-11" "Ighv1-66" "Ighv1-67"
## [366] "Ighv1-68" "Ighv1-69" "Ighv8-12" "Ighv1-70" "Ighv1-71"
## [371] "Ighv1-72" "Ighv8-13" "Ighv1-73" "Ighv1-74" "Ighv8-14"
## [376] "Ighv1-75" "Ighv8-15" "Ighv1-76" "Ighv8-16" "Ighv1-77"
## [381] "Ighv1-78" "Gm42643" "Ighv1-79" "Ighv1-80" "Ighv1-81"
## [386] "Gm42990" "Ighv1-82" "Ighv1-83" "Ighv1-84" "Ighv1-85"
## [391] "Ighv1-86" "Iglc1" "Iglj1" "Iglc3" "Iglj3p"
## [396] "Iglj3" "Iglv1" "Iglc4" "Iglj4" "Iglc2"
## [401] "Iglj2" "Iglv3" "Iglv2" "Gm50428" "Gm50427"
## [406] "Gm50426" "Jchain"
keep <- !rownames(counts.filtered) %in% ig.genes
counts.filtered <- counts.filtered[keep,]
# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered,
meta.data = mouse.filtered@meta.data)
# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "305 features removed"
# cleanup data
remove(mouse,counts,counts.filtered,nonzero,data)
# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse.filtered$sample))
colnames(data) <- c("sample","frequency")
ncells2 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
scale_fill_manual(values = sample_colors) +
scale_y_continuous(breaks = seq(0,12000, by = 3000), limits = c(0,12000)) +
ggtitle("Filtered: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
# Arrange graphs in grid
plots <- list(ncells1,ncells2)
layout <- cbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# Visualize nCount_RNA
den1 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = nCount_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("nCount_RNA") +
ylab("Density") +
theme(legend.position = "none") +
geom_vline(xintercept = nCount.min) +
geom_vline(xintercept = nCount.max) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize nFeature_RNA
den2 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = nFeature_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("nFeature_RNA") +
ylab("Density") +
geom_vline(xintercept = nFeature.min) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = cell.complexity,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("Cell Complexity (log10(nFeature/nCount))") +
ylab("Density") +
geom_vline(xintercept = complexity.cutoff) +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.mt
den4 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.ribo.protein
den5 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.ribo.protein,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = ribo.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Ribosomal Protein Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.hb
den6 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.hb,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = hb.cutoff) +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Hemoglobin Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# nFeature, nCount, and cell.complexity violins
v3 <- VlnPlot(mouse.filtered,
features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v3
# percent violins
v4 <- VlnPlot(mouse.filtered,
features = c("percent.mt","percent.ribo.protein","percent.hb"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v4
s3 <- ggplot(
mouse.filtered@meta.data,
aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = nCount.min) +
geom_hline(yintercept = nFeature.min) +
facet_wrap(~sample) +
scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s3
## `geom_smooth()` using formula 'y ~ x'
s4 <- FeatureScatter(mouse.filtered,
feature1 = "nCount_RNA",
feature2 = "percent.mt",
group.by = 'sample',
cols = sample_colors,
shuffle = TRUE)
s4
# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(mouse.filtered@meta.data,
aes(x = sample,
y = log10(nFeature_RNA),
fill=sample)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
ggtitle("Unique Genes / Cell / Sample") +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("Sample")
b1
df <- data.frame(row.names = rownames(mouse.filtered))
df$rsum <- rowSums(x = mouse.filtered, slot = "counts")
df$gene_name <- rownames(df)
df <- df[order(df$rsum, decreasing = TRUE),]
head(df, 30)
## rsum gene_name
## Gm42418 31600395 Gm42418
## Malat1 8955952 Malat1
## Tmsb4x 1694214 Tmsb4x
## S100a9 1364471 S100a9
## Actb 1042136 Actb
## Cd74 1001399 Cd74
## S100a8 1000450 S100a8
## AY036118 885982 AY036118
## Apoe 737847 Apoe
## Lyz2 734883 Lyz2
## Tpt1 633664 Tpt1
## Fth1 633423 Fth1
## Cst3 589509 Cst3
## Mgp 487094 Mgp
## Igfbp5 471291 Igfbp5
## Eef1a1 452645 Eef1a1
## Itm2b 445733 Itm2b
## Ftl1 432500 Ftl1
## Fau 413255 Fau
## H2-Aa 402144 H2-Aa
## H2-Ab1 380160 H2-Ab1
## H2-Eb1 371556 H2-Eb1
## C1qa 368105 C1qa
## Mrc1 356628 Mrc1
## Lars2 307448 Lars2
## Selenop 306359 Selenop
## Ahnak 299493 Ahnak
## C1qb 274284 C1qb
## B2m 272551 B2m
## Ptma 267865 Ptma
# log normalization
mouse.phase <- NormalizeData(mouse.filtered)
# load mouse cell cycle markers
phase.markers <- read.delim("../../refs/mouse_cell_cycle.txt")
colnames(phase.markers)[2] <- "gene_id"
phase.markers <- left_join(phase.markers, genes[,c(10,13)], by = "gene_id")
g2m <- phase.markers[phase.markers$phase == "G2/M", 4]
g2m
## [1] "Ube2c" "Lbr" "Ctcf" "Cdc20" "Cbx5" "Kif11" "Anp32e"
## [8] "Birc5" "Cdk1" "Tmpo" "Hmmr" "Jpt1" "Pimreg" "Aurkb"
## [15] "Top2a" "Gtse1" "Rangap1" "Cdca3" "Ndc80" "Kif20b" "Cenpf"
## [22] "Nek2" "Nuf2" "Nusap1" "Bub1" "Tpx2" "Aurka" "Ect2"
## [29] "Cks1b" "Kif2c" "Cdca8" "Cenpa" "Mki67" "Ccnb2" "Kif23"
## [36] "Smc4" "G2e3" "Tubb4b" "Anln" "Tacc3" "Dlgap5" "Ckap2"
## [43] "Ncapd2" "Ttk" "Ckap5" "Cdc25c" "Hjurp" "Cenpe" "Ckap2l"
## [50] "Cdca2" "Hmgb2" "Cks2" "Psrc1" "Gas2l3"
s <- phase.markers[phase.markers$phase == "S", 4]
s
## [1] "Cdc45" "Uhrf1" "Mcm2" "Slbp" "Mcm5" "Pola1"
## [7] "Gmnn" "Cdc6" "Rrm2" "Atad2" "Dscc1" "Mcm4"
## [13] "Chaf1b" "Rfc2" "Msh2" "Fen1" "Hells" "Prim1"
## [19] "Tyms" "Mcm6" "Wdr76" "Rad51" "Pcna" "Ccne2"
## [25] "Casp8ap2" "Usp1" "Nasp" "Rpa2" "Ung" "Rad51ap1"
## [31] "Blm" "Pold3" "Rrm1" "Cenpu" "Gins2" "Tipin"
## [37] "Brip1" "Dtl" "Exo1" "Ubr7" "Clspn" "E2f8"
## [43] "Cdca7"
# write table
write.table(phase.markers,
"../../results/cellcycle/mouse_cell_cycle_phase_markers.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
# score cells
mouse.phase <- CellCycleScoring(mouse.phase,
g2m.features = g2m,
s.features = s,
set.ident = TRUE)
mouse.filtered$Phase <- mouse.phase$Phase
Find the top variable genes before performing PCA. The data is scaled since highly expressed genes usually are the most variable. This will make the mean expression zero and the variance of each gene across cells is one.
# Identify the most variable genes
mouse.phase <- FindVariableFeatures(mouse.phase, verbose = FALSE)
# Preview top 40
head(VariableFeatures(mouse.phase), 40)
## [1] "Hist1h1b" "Mpz" "Camp" "Acta2" "Ccl21a" "Myh11"
## [7] "Gzma" "Cxcl9" "Retnla" "Mmrn1" "Tagln" "Myoc"
## [13] "Ptgds" "Mcpt4" "Ngp" "Cpa3" "Cma1" "Tpsb2"
## [19] "Ccl5" "Ttr" "Cxcl10" "Hist1h2ap" "Hist1h2ae" "Chil3"
## [25] "Hist1h3c" "Chad" "Igha" "Apod" "Ltf" "Col6a5"
## [31] "Penk" "Lcn2" "Atp1a2" "Tpm2" "Fcnb" "Mmp13"
## [37] "Vtn" "Retnlg" "Olfm4" "Hist1h1a"
# Scale the counts
mouse.phase <- ScaleData(mouse.phase)
## Centering and scaling data matrix
If the PCA plots for each phase do not look similar you may want to regress out variation due to cell cycle phase. Otherwise, nothing needs to be done.
# Run PCA
mouse.phase <- RunPCA(mouse.phase, nfeatures.print = 10)
## PC_ 1
## Positive: Col1a2, Col1a1, Cdh11, Fstl1, Col6a1, Gas1, Bgn, Mgp, Col12a1, Col6a2
## Negative: Lyz2, Cd52, Cd74, Mpeg1, H2-Aa, H2-Ab1, H2-Eb1, C1qb, Alox5ap, Arhgdib
## PC_ 2
## Positive: Flt1, Ptprb, Kdr, Podxl, Pecam1, Adgrf5, Egfl7, Emcn, Adgrl4, Cyyr1
## Negative: Lrp1, Emb, Igf1, Serpinf1, Col1a2, Col12a1, Cdh11, Col1a1, Mpp6, Foxd1
## PC_ 3
## Positive: C1qb, Ms4a7, Fcrls, Pf4, Cbr2, F13a1, Cd163, Hpgd, Igf1, Clec10a
## Negative: Top2a, Hist1h1b, Mki67, Kif11, Nusap1, Birc5, Cdca8, Pclaf, Hist1h3c, Knl1
## PC_ 4
## Positive: S100a8, S100a9, Retnlg, Mmp8, Lcn2, Wfdc21, Hp, Cxcr2, Pglyrp1, Pygl
## Negative: Top2a, Cd79b, Hist1h1b, Vpreb3, Kif11, Cd79a, Knl1, Nusap1, Prc1, Kif15
## PC_ 5
## Positive: Myh11, Acta2, Notch3, Lmod1, Tagln, Myl9, Mustn1, Tpm2, Gucy1b1, Carmn
## Negative: Slc26a7, Col8a2, Slc47a1, Col12a1, Rspo3, Tspan11, Fxyd5, Shisa3, Cpz, Ecrg4
# Plot
pca1 <- DimPlot(mouse.phase,
reduction = "pca",
group.by = "Phase",
split.by = "Phase")
pca1
pca2 <- DimPlot(mouse.phase,
reduction = "pca",
group.by = "Phase",
shuffle = TRUE)
pca2
data <- as.data.frame(table(mouse.phase$Phase))
colnames(data) <- c("Phase","frequency")
ncells3 <- ggplot(data, aes(x = Phase, y = frequency, fill = Phase)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
ggtitle("Cells per phase")
ncells3
percent.phase <- mouse.phase@meta.data %>%
group_by(sample, Phase) %>%
dplyr::count() %>%
group_by(sample) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x = sample, y = percent, fill = Phase)) +
geom_col() +
ggtitle("Percentage of phase per sample")
percent.phase
Evaluating effects of mitochondrial expression
# Check quartile values and store
summary(mouse.phase$percent.mt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.110 5.325 6.787 9.665 19.992
first <- as.numeric(summary(mouse.phase$percent.mt)[2])
mean <- as.numeric(summary(mouse.phase$percent.mt)[4])
third <- as.numeric(summary(mouse.phase$percent.mt)[5])
# Turn percent.mt into factor based on quartile value
mouse.phase@meta.data$mito.factor <- cut(mouse.phase@meta.data$percent.mt,
breaks=c(-Inf, first, mean, third, Inf),
labels=c("Low","Medium","Medium high", "High"))
mouse.filtered[["mito.factor"]] <- mouse.phase$mito.factor
# Plot
pca1 <- DimPlot(mouse.phase,
reduction = "pca",
group.by = "mito.factor",
split.by = "mito.factor")
pca1
pca2 <- DimPlot(mouse.phase,
reduction = "pca",
group.by = "mito.factor",
shuffle = TRUE)
pca2
percent <- mouse.phase@meta.data %>%
group_by(sample, mito.factor) %>%
dplyr::count() %>%
group_by(sample) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x = sample, y = percent, fill = mito.factor)) +
geom_col() +
ggtitle("Mitochondrial fraction per sample")
percent
Now, we can use the SCTransform method as a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes. Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model.
Sctransform automatically accounts for cellular sequencing depth by regressing out sequencing depth (nUMIs). However, if there are other sources of uninteresting variation identified in the data during the exploration steps we can also include these. We observed little to no effect due to cell cycle phase or percent.mito and so we chose not to regress this out of our data.
Since we have 8 samples in our dataset we want to keep them as separate objects and transform them as that is what is required for integration.
# split
mouse.split <- SplitObject(mouse.filtered, split.by = "sample")
mouse.split
## $E3.2M.F
## An object of class Seurat
## 20251 features across 4632 samples within 1 assay
## Active assay: RNA (20251 features, 0 variable features)
##
## $E3.2M.M
## An object of class Seurat
## 20251 features across 4196 samples within 1 assay
## Active assay: RNA (20251 features, 0 variable features)
##
## $E3.14M.F
## An object of class Seurat
## 20251 features across 3762 samples within 1 assay
## Active assay: RNA (20251 features, 0 variable features)
##
## $E3.14M.M
## An object of class Seurat
## 20251 features across 4619 samples within 1 assay
## Active assay: RNA (20251 features, 0 variable features)
##
## $E4.2M.F
## An object of class Seurat
## 20251 features across 3754 samples within 1 assay
## Active assay: RNA (20251 features, 0 variable features)
##
## $E4.2M.M
## An object of class Seurat
## 20251 features across 3201 samples within 1 assay
## Active assay: RNA (20251 features, 0 variable features)
##
## $E4.14M.F
## An object of class Seurat
## 20251 features across 6696 samples within 1 assay
## Active assay: RNA (20251 features, 0 variable features)
##
## $E4.14M.M
## An object of class Seurat
## 20251 features across 4289 samples within 1 assay
## Active assay: RNA (20251 features, 0 variable features)
# cleanup
remove(mouse.phase, mouse.filtered)
We will use a ‘for loop’ to run the SCTransform() on each sample, and regress out mitochondrial expression by specifying in the vars.to.regress argument of the SCTransform() function.
Before we run this for loop, we know that the output can generate large R objects/variables in terms of memory. If we have a large dataset, then we might need to adjust the limit for allowable object sizes within R (Default is 500 * 1024 ^ 2 = 500 Mb).
options(future.globals.maxSize = 4000 * 1024^5)
for (i in 1:length(mouse.split)) {
print(paste0("Sample ", i))
mouse.split[[i]] <- SCTransform(mouse.split[[i]], verbose = FALSE)
}
## [1] "Sample 1"
## [1] "Sample 2"
## [1] "Sample 3"
## [1] "Sample 4"
## [1] "Sample 5"
## [1] "Sample 6"
## [1] "Sample 7"
## [1] "Sample 8"
mouse.split
## $E3.2M.F
## An object of class Seurat
## 37104 features across 4632 samples within 2 assays
## Active assay: SCT (16853 features, 3000 variable features)
## 1 other assay present: RNA
##
## $E3.2M.M
## An object of class Seurat
## 37520 features across 4196 samples within 2 assays
## Active assay: SCT (17269 features, 3000 variable features)
## 1 other assay present: RNA
##
## $E3.14M.F
## An object of class Seurat
## 37542 features across 3762 samples within 2 assays
## Active assay: SCT (17291 features, 3000 variable features)
## 1 other assay present: RNA
##
## $E3.14M.M
## An object of class Seurat
## 38017 features across 4619 samples within 2 assays
## Active assay: SCT (17766 features, 3000 variable features)
## 1 other assay present: RNA
##
## $E4.2M.F
## An object of class Seurat
## 37792 features across 3754 samples within 2 assays
## Active assay: SCT (17541 features, 3000 variable features)
## 1 other assay present: RNA
##
## $E4.2M.M
## An object of class Seurat
## 37263 features across 3201 samples within 2 assays
## Active assay: SCT (17012 features, 3000 variable features)
## 1 other assay present: RNA
##
## $E4.14M.F
## An object of class Seurat
## 38473 features across 6696 samples within 2 assays
## Active assay: SCT (18222 features, 3000 variable features)
## 1 other assay present: RNA
##
## $E4.14M.M
## An object of class Seurat
## 37612 features across 4289 samples within 2 assays
## Active assay: SCT (17361 features, 3000 variable features)
## 1 other assay present: RNA
Condition-specific clustering of cells indicates that we need to integrate the cells across conditions to ensure that cells of the same cell type cluster together.
To integrate, use the shared highly variable genes from each condition identified using SCTransform. Then, integrate conditions to overlay cells that are similar or have a “common set of biological features” between groups.
Now, using our SCTransform object as input, let’s perform the integration across conditions.
First, we need to specify that we want to use all of the 3000 most variable genes identified by SCTransform for the integration. By default, this function selects the top 2000 genes.
# Choose the features to use when integrating multiple datasets
# will use nfeatures as 3000 as defined by running SCTransform above
var.features <- SelectIntegrationFeatures(object.list = mouse.split,
nfeatures = 3000)
# merge the mouse
mouse.merged <- merge(x = mouse.split[[1]],
y = c(mouse.split[[2]], mouse.split[[3]], mouse.split[[4]],
mouse.split[[5]], mouse.split[[6]], mouse.split[[7]], mouse.split[[8]]))
# define the variable features
VariableFeatures(mouse.merged) <- var.features
# run PCA on the merged object
mouse.merged <- RunPCA(object = mouse.merged, assay = "SCT")
# harmony dimensional reduction
mouse.integrated <- RunHarmony(object = mouse.merged,
group.by.vars = "sample",
assay.use = "SCT",
reduction = "pca",
plot_convergence = TRUE)
# save and cleanup
saveRDS(mouse.integrated, "../../rObjects/mouse_integrated.rds")
remove(mouse.split, var.features, mouse.merged)
# Reset idents and levels
DefaultAssay(mouse.integrated) <- "SCT"
Idents(mouse.integrated) <- "sample"
mouse.integrated$sample <- factor(mouse.integrated$sample,
levels = sample_order)
mouse.integrated$age <- factor(mouse.integrated$age,
levels = age_order)
mouse.integrated$sex <- factor(mouse.integrated$sex,
levels = sex_order)
mouse.integrated$isoform <- factor(mouse.integrated$isoform,
levels = isoform_order)
# check PCA
p1 <- DimPlot(object = mouse.integrated,
reduction = "harmony",
group.by = "sample",
cols = sample_colors,
shuffle = TRUE) + NoLegend()
p1
p2 <- VlnPlot(object = mouse.integrated,
features = "harmony_1",
group.by = "sample",
pt.size = 0,
cols = sample_colors) + NoLegend()
p2
Top 20 variable features
top20 <- mouse.integrated@assays$SCT@var.features[1:20]
top20
## [1] "S100a9" "S100a8" "Igfbp5" "Mgp" "Cd74" "Ngp" "Retnlg" "Rgs5"
## [9] "Ltf" "Lcn2" "Cpa3" "Camp" "Tpsb2" "Cma1" "Mcpt4" "Igfbp3"
## [17] "Mmp8" "Col1a2" "H2-Ab1" "Igf2"
After integration, to visualize the integrated data we can use dimensionality reduction techniques, such as PCA and Uniform Manifold Approximation and Projection (UMAP). While PCA will determine all PCs, we can only plot two at a time. In contrast, UMAP will take the information from any number of top PCs to arrange the cells in this multidimensional space. It will take those distances in multidimensional space, and try to plot them in two dimensions. In this way, the distances between cells represent similarity in expression.
To generate these visualizations with the harmony output, use reduction = “harmony”
# Plot PCA
pca1 <- DimPlot(mouse.integrated,
reduction = "harmony",
split.by = "sample",
group.by = "sample",
cols = sample_colors)
pca1
pca2 <- DimPlot(mouse.integrated,
reduction = "harmony",
split.by = "isoform",
group.by = "isoform",
cols = isoform_colors)
pca2
pca3 <- DimPlot(mouse.integrated,
reduction = "harmony",
split.by = "age",
group.by = "age",
cols = age_colors)
pca3
pca4 <- DimPlot(mouse.integrated,
reduction = "harmony",
split.by = "sex",
group.by = "sex",
cols = sex_colors)
pca4
To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.
# Printing out the most variable genes driving PCs
print(x = mouse.integrated[["pca"]],
dims = 1:10,
nfeatures = 10)
## PC_ 1
## Positive: Lyz2, C1qa, Cd74, C1qb, Mrc1, Apoe, H2-Aa, H2-Ab1, C1qc, H2-Eb1
## Negative: Ptprb, Flt1, Mgp, Plpp3, Igfbp7, Sparc, Kdr, Col4a1, Plvap, Podxl
## PC_ 2
## Positive: Col1a2, Igfbp5, Col1a1, Mgp, Col12a1, Slc38a2, Cdh11, Ogn, Ecrg4, Gas1
## Negative: Flt1, Ptprb, Kdr, Plvap, Podxl, Igfbp3, Pecam1, Egfl7, Emcn, Ly6c1
## PC_ 3
## Positive: C1qa, Apoe, Mrc1, C1qb, C1qc, Pf4, Selenop, Ctsb, Stab1, Csf1r
## Negative: S100a9, S100a8, Cd52, S100a6, Hp, Hmgb2, Lcn2, Pglyrp1, S100a11, Retnlg
## PC_ 4
## Positive: S100a9, S100a8, Lyz2, Mmp9, Hp, Lcn2, Retnlg, Pglyrp1, Mmp8, Wfdc21
## Negative: Cd79a, Cd79b, Cd74, Ly6d, Ms4a1, Vpreb3, Ptprcap, Siglecg, Pax5, Spib
## PC_ 5
## Positive: Cd74, H2-Ab1, H2-Aa, H2-Eb1, Ccr2, Mpeg1, Cd52, Ctss, S100a4, Crip1
## Negative: Stab1, Mrc1, Cpa3, Cma1, Tpsb2, Mcpt4, Il1rl1, Serpinb1a, Mrgprb1, Slc18a2
## PC_ 6
## Positive: Cd79a, Cd79b, Ly6d, Vpreb3, Ms4a1, Pax5, Cd24a, Plpp3, Spib, Siglecg
## Negative: Crip1, Myh11, Myl9, Acta2, Tagln, Rgs5, Notch3, Ccr2, Tpm2, Vim
## PC_ 7
## Positive: Il1rl1, Cpa3, Kit, Tpsb2, Cma1, Mcpt4, Mrgprb1, Slc18a2, Ms4a2, Cd200r3
## Negative: Myl9, Myh11, Acta2, Rgs5, Notch3, Tagln, Tpm2, Cd79a, Ebf1, Igfbp7
## PC_ 8
## Positive: Ms4a1, Ly6d, Ltb, Cd37, Cd52, Ikzf3, Ptprc, Cd74, Cd79b, Cd79a
## Negative: Mki67, Top2a, Stmn1, Hist1h1b, Knl1, Kif11, Hmgb2, H2afz, Pclaf, Prc1
## PC_ 9
## Positive: Igfbp3, Igfbp7, Cpa3, Cma1, Mcpt4, Tpsb2, Col4a1, Mrgprb1, Kit, Serpinb1a
## Negative: Vwf, Cfh, Cldn5, Slco1a4, Clec14a, Slco2a1, Adgrg6, Mal, Lrg1, Ptn
## PC_ 10
## Positive: Vwf, Cd74, H2-Ab1, H2-Aa, H2-Eb1, Cd79a, Cfh, Ly6d, Cd79b, Slco1a4
## Negative: Ms4a4b, Gimap3, Trbc2, Il7r, Skap1, Cd3g, Bcl11b, Cd3e, Ccl5, Trbc1
Quantitative approach to an elbow plot
- The point where the principal components only contribute 5% of standard deviation and the principal components cumulatively contribute 90% of the standard deviation.
- The point where the percent change in variation between the consecutive PCs is less than 0.1%.
First metric
# Determine percent of variation associated with each PC
stdv <- mouse.integrated[["pca"]]@stdev
sum.stdv <- sum(mouse.integrated[["pca"]]@stdev)
percent.stdv <- (stdv / sum.stdv) * 100
# Calculate cumulative percents for each PC
cumulative <- cumsum(percent.stdv)
# Determine which PC exhibits cumulative percent greater than 90% and
# and % variation associated with the PC as less than 5
co1 <- which(cumulative > 90 & percent.stdv < 5)[1]
co1
## [1] 41
Second metric
# Determine the difference between variation of PC and subsequent PC
co2 <- sort(which(
(percent.stdv[1:length(percent.stdv) - 1] -
percent.stdv[2:length(percent.stdv)]) > 0.1),
decreasing = T)[1] + 1
# last point where change of % of variation is more than 0.1%.
co2
## [1] 21
Choose the minimum of these two metrics as the PCs covering the majority of the variation in the data.
# Minimum of the two calculation
min.pc <- min(co1, co2)
min.pc
## [1] 21
Use min.pc we just calculated to generate the clusters. We can plot the elbow plot again and overlay the information determined using our metrics:
# Create a dataframe with values
plot_df <- data.frame(pct = percent.stdv,
cumu = cumulative,
rank = 1:length(percent.stdv))
# Elbow plot to visualize
ggplot(plot_df, aes(cumulative, percent.stdv, label = rank, color = rank > min.pc)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(percent.stdv[percent.stdv > 5]), color = "grey") +
theme_bw()
# Run UMAP
mouse.integrated <- RunUMAP(mouse.integrated,
dims = 1:min.pc,
reduction = "harmony",
n.components = 3) # set to 3 to use with VR
# plot UMAP
DimPlot(mouse.integrated,
shuffle = TRUE)
Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’ [Seurat - Guided Clustering Tutorial].
We will use the FindClusters() function to perform the graph-based clustering. The resolution is an important argument that sets the “granularity” of the downstream clustering and will need to be optimized for every individual experiment. For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering. Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.
The FindClusters() function allows us to enter a series of resolutions and will calculate the “granularity” of the clustering. This is very helpful for testing which resolution works for moving forward without having to run the function for each resolution.
# Determine the K-nearest neighbor graph
mouse.unannotated <- FindNeighbors(object = mouse.integrated,
assay = "SCT", # set as default after SCTransform
reduction = "harmony",
dims = 1:min.pc)
# Determine the clusters for various resolutions
mouse.unannotated <- FindClusters(object = mouse.unannotated,
algorithm = 1, # 1 = Louvain
resolution = seq(0.1,0.8,by=0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 35149
## Number of edges: 1187040
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9755
## Number of communities: 10
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 35149
## Number of edges: 1187040
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9631
## Number of communities: 14
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 35149
## Number of edges: 1187040
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9547
## Number of communities: 19
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 35149
## Number of edges: 1187040
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9477
## Number of communities: 22
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 35149
## Number of edges: 1187040
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9419
## Number of communities: 25
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 35149
## Number of edges: 1187040
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9367
## Number of communities: 28
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 35149
## Number of edges: 1187040
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9317
## Number of communities: 29
## Elapsed time: 4 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 35149
## Number of edges: 1187040
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9270
## Number of communities: 33
## Elapsed time: 4 seconds
mouse.unannotated$seurat_clusters <- mouse.unannotated$SCT_snn_res.0.5
# 0.1
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.1",
label = TRUE)
# 0.2
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.2",
label = TRUE)
# 0.3
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.3",
label = TRUE)
# 0.4
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.4",
label = TRUE)
# 0.5
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.5",
label = TRUE)
# clusters
Idents(mouse.unannotated) <- "SCT_snn_res.0.5"
u1 <- DimPlot(mouse.unannotated,
label = FALSE,
split.by = "sample",
#cols = colors,
ncol = 2)
u1
# isoform
u2 <- DimPlot(mouse.unannotated,
label = FALSE,
split.by = "isoform",
group.by = "isoform",
cols = isoform_colors)
u2
# age
u3 <- DimPlot(mouse.unannotated,
label = FALSE,
split.by = "age",
group.by = "age",
cols = age_colors)
u3
# sex
u4 <- DimPlot(mouse.unannotated,
label = FALSE,
split.by = "sex",
group.by = "sex",
cols = sex_colors)
u4
# phase
u5 <- DimPlot(mouse.unannotated,
label = FALSE,
split.by = "Phase",
group.by = "Phase")
u5
# mito.factor
u6 <- DimPlot(mouse.unannotated,
label = FALSE,
split.by = "mito.factor",
group.by = "mito.factor",
ncol = 2)
u6
# nCount
f1 <- FeaturePlot(mouse.unannotated,
features = "nCount_RNA",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f1
# nFeature
f2 <- FeaturePlot(mouse.unannotated,
features = "nFeature_RNA",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f2
# percent.mt
f3 <- FeaturePlot(mouse.unannotated,
features = "percent.mt",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f3
# cell.complexity
f4 <- FeaturePlot(mouse.unannotated,
features = "cell.complexity",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f4
# percent.ribo
f5 <- FeaturePlot(mouse.unannotated,
features = "percent.ribo.protein",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f5
# percent.hb
f6 <- FeaturePlot(mouse.unannotated,
features = "percent.hb",
pt.size = 0.4,
order = TRUE) +
scale_colour_gradientn(colours = c("blue","lightblue","yellow","orange","red"))
f6
# sample
b1 <- mouse.unannotated@meta.data %>%
group_by(seurat_clusters, sample) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=sample)) +
theme_classic() +
geom_col() +
scale_fill_manual(values = sample_colors) +
ggtitle("Percentage of sample per cluster")
b1
# isoform
b2 <- mouse.unannotated@meta.data %>%
group_by(seurat_clusters, isoform) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=isoform)) +
theme_classic() +
geom_col() +
scale_fill_manual(values = isoform_colors) +
ggtitle("Percentage of isoform per cluster")
b2
# age
b3 <- mouse.unannotated@meta.data %>%
group_by(seurat_clusters, age) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=age)) +
theme_classic() +
geom_col() +
scale_fill_manual(values = age_colors) +
ggtitle("Percentage of age per cluster")
b3
# sex
b4 <- mouse.unannotated@meta.data %>%
group_by(seurat_clusters, sex) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=sex)) +
theme_classic() +
geom_col() +
scale_fill_manual(values = sex_colors) +
ggtitle("Percentage of sex per cluster")
b4
# Phase
b5 <- mouse.unannotated@meta.data %>%
group_by(seurat_clusters, Phase) %>%
dplyr::count() %>%
group_by(seurat_clusters) %>%
dplyr::mutate(percent = 100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=Phase)) +
theme_classic() +
geom_col() +
ggtitle("Percentage of phase per cluster")
b5
# sample
sample_ncells <- FetchData(mouse.unannotated,
vars = c("ident", "sample")) %>%
dplyr::count(ident, sample) %>%
tidyr::spread(ident, n)
sample_ncells
## sample 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1 E3.2M.M 300 612 370 249 366 239 201 140 220 247 98 149 110 111 58 124 143
## 2 E3.2M.F 458 693 160 296 260 261 249 180 170 298 182 220 132 142 100 121 115
## 3 E4.2M.M 341 277 289 163 197 166 205 290 135 137 103 81 103 123 77 43 50
## 4 E4.2M.F 406 379 273 269 313 269 296 157 124 119 144 73 136 122 101 70 81
## 5 E3.14M.M 490 335 295 312 275 249 274 327 282 206 216 113 184 199 96 85 93
## 6 E3.14M.F 233 368 300 363 335 294 157 355 173 183 152 154 62 11 77 105 96
## 7 E4.14M.M 433 233 369 278 256 250 281 329 310 189 167 92 120 144 94 114 90
## 8 E4.14M.F 691 447 587 527 431 562 355 121 456 294 300 200 216 118 362 192 181
## 17 18 19 20 21 22 23 24
## 1 177 41 56 60 43 31 26 25
## 2 112 75 75 85 114 102 18 14
## 3 87 54 59 40 48 86 25 22
## 4 118 61 53 33 31 58 37 31
## 5 97 113 97 66 88 53 45 29
## 6 54 103 64 58 28 11 13 13
## 7 52 126 81 72 70 61 50 28
## 8 62 159 85 128 85 50 52 35
write.table(sample_ncells,
"../../results/ncells/cells_per_cluster_per_sample_uannotated.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
# age
age_ncells <- FetchData(mouse.unannotated,
vars = c("ident", "age")) %>%
dplyr::count(ident, age) %>%
tidyr::spread(ident, n)
age_ncells
## age 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 1 2 months 1505 1961 1092 977 1136 935 951 767 649 801 527 523 481 498
## 2 14 months 1847 1383 1551 1480 1297 1355 1067 1132 1221 872 835 559 582 472
## 14 15 16 17 18 19 20 21 22 23 24
## 1 336 358 389 494 231 243 218 236 277 106 92
## 2 629 496 460 265 501 327 324 271 175 160 105
write.table(age_ncells,
"../../results/ncells/cells_per_cluster_per_age_unannotated.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
# sex
sex_ncells <- FetchData(mouse.unannotated,
vars = c("ident", "sex")) %>%
dplyr::count(ident, sex) %>%
tidyr::spread(ident, n)
sex_ncells
## sex 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 Male 1564 1457 1323 1002 1094 904 961 1086 947 779 584 435 517 577 325
## 2 Female 1788 1887 1320 1455 1339 1386 1057 813 923 894 778 647 546 393 640
## 15 16 17 18 19 20 21 22 23 24
## 1 366 376 413 334 293 238 249 231 146 104
## 2 488 473 346 398 277 304 258 221 120 93
write.table(sex_ncells,
"../../results/ncells/cells_per_cluster_per_sex_unannoated.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
# isoform
isoform_ncells <- FetchData(mouse.unannotated,
vars = c("ident", "isoform")) %>%
dplyr::count(ident, isoform) %>%
tidyr::spread(ident, n)
isoform_ncells
## isoform 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1 E3 1481 2008 1125 1220 1236 1043 881 1002 845 934 648 636 488 463 331
## 2 E4 1871 1336 1518 1237 1197 1247 1137 897 1025 739 714 446 575 507 634
## 15 16 17 18 19 20 21 22 23 24
## 1 435 447 440 332 292 269 273 197 102 81
## 2 419 402 319 400 278 273 234 255 164 116
write.table(isoform_ncells,
"../../results/ncells/cells_per_cluster_per_isoform_unannotated.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)
# phase
phase_ncells <- FetchData(mouse.unannotated,
vars = c("ident", "Phase")) %>%
dplyr::count(ident, Phase) %>%
tidyr::spread(ident, n)
phase_ncells
## Phase 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1 G1 1994 1951 880 2166 1100 1348 884 283 445 580 814 516 801 145 459 190
## 2 G2M 460 425 772 115 323 496 544 1446 591 477 275 242 100 353 337 305
## 3 S 898 968 991 176 1010 446 590 170 834 616 273 324 162 472 169 359
## 16 17 18 19 20 21 22 23 24
## 1 413 208 456 129 NA 159 109 67 100
## 2 176 202 117 281 432 171 198 69 37
## 3 260 349 159 160 110 177 145 130 60
write.table(phase_ncells,
"../../results/ncells/cells_per_cluster_per_phase_unannotated.tsv",
quote = FALSE, sep = "\t", row.names = FALSE)